00_all

Please write quarto render r/00_all.qmd in the terminal to render all .qmd documents and generates a html that contains the full project. Make sure quatro is installed for this: https://quarto.org/docs/get-started/

01_load

library("tidyverse")

Making sure the directories exist

raw_dir <- "../data/_raw/"

if( !dir.exists(raw_dir) ){
  dir.create(path = raw_dir)
}

Please see the README to for the data retrieval and correct placement.

raw_data <- read_csv("../data/_raw/SAA_Biospecimen_Analysis_Results_01Nov2025.csv")

Saving data-frame

write_tsv(raw_data, "../data/01_dat_load.tsv")

02_clean

library(tidyverse)

Loading the raw data

raw_data <- read_tsv("../data/01_dat_load.tsv")

Clean the data

Selecting only PD patients:

SWEDD is an outdated ID, mentioned in the PPMI data guide, and we will not be using the data from these patients.

tidy_data <- raw_data |>
  filter(COHORT != "SWEDD")

Deleting columns

Some columns in the data-set are not part of the chosen analytic points. The columns chosen to omit is at the cause are either:

  • Irrelevant to the analysis such as the name of the institution or the responsible practitioner.

  • Contain the same value for all observation such as the TYPE (which is always a cerebrospinal fluid sample) and SAAMethod (results always made on the basis of a SAA).

  • Only have a value in observed in one of the projects: SAA_Type, TSmax, T50.

  • Lack meassurements for most observations like SampleVolume.

# Neglecting the unused columns:
tidy_data <- tidy_data |>
  select(-"COHORT",
         -"PI_NAME",
         -"PI_INSTITUTION",
         -"TYPE",
         -"SAAMethod",
         -"SAA_Type",
         -starts_with("TSmax"),
         -starts_with("SLOPE"),
         -starts_with("T50"),
         -starts_with("Sample"))

Missing value interpretation

One person (patient_number = 41184) has the value “U01” in the clinical event column. As no information is found explaining the value, the patient is removed from the data.

tidy_data <- tidy_data |>
  filter(CLINICAL_EVENT != "U01")

Elongating the data

Originally all repetitions of Fmax, TTT and AUC had its own column. To make the data more comprehensible all repetitions are stored in one column for each measurement. The data will then contain only one column of each measurement (Fmax/AUC/TTT), a column with the duration (either 24 or 150 hours) and a column defining the repetition number (1-3) of the measurement.

tidy_data <- tidy_data |> 
  pivot_longer(
    cols = matches("Fmax|TTT|AUC"), 
    names_to = c("measurement", "duration", "rep"), 
    names_pattern = ("(Fmax|TTT|AUC)_(\\d+)h_Rep(\\d+)"),
    values_to = "value",
    values_drop_na = TRUE
  ) |> 
  
  pivot_wider(
    names_from = measurement,
    values_from = value
  )

Reducing the “Instrument” columns

When looking at the data it seemed that the instrument used for one observation for all three repetitions was the same. We found that it was the case for all observations when checking:

all(
  tidy_data$InstrumentRep1 == tidy_data$InstrumentRep2 & tidy_data$InstrumentRep2 == tidy_data$InstrumentRep3
  )
[1] TRUE

To reduce the number of columns, the instrument is stored in a single column.

tidy_data <- tidy_data |> 
  select(!InstrumentRep2 & !InstrumentRep3) |> 
  rename(instrument = InstrumentRep1)

Renaming the columns to preferred naming convention

tidy_data <- tidy_data |>
  rename(
    patient_number = PATNO,
    sex = SEX,
    project = PROJECTID,
    clinical_event = CLINICAL_EVENT,
    saa_result = SAA_Status,
    rundate = RUNDATE,
    fmax = Fmax,
    ttt = TTT,
    auc = AUC)
tidy_data |> 
  slice_sample(n = 10)
# A tibble: 10 × 12
   patient_number sex    clinical_event saa_result instrument rundate    project
            <dbl> <chr>  <chr>          <chr>           <dbl> <date>       <dbl>
 1         110219 Male   V04            Negative            4 2023-12-18     237
 2           4027 Male   V04            Negative            2 2023-12-18     237
 3           4020 Male   V04            Positive            6 2024-12-10     237
 4           3514 Male   BL             Positive            6 2024-12-16     237
 5           3825 Male   V04            Positive            5 2024-10-30     237
 6         251185 Male   BL             Positive            9 2024-04-22     237
 7           4102 Male   BL             Positive            4 2022-03-24     155
 8         203717 Female BL             Positive            2 2023-09-29     237
 9         157589 Male   V06            Negative            6 2025-05-28     237
10           4057 Male   BL             Positive            2 2022-05-19     155
# ℹ 5 more variables: duration <chr>, rep <chr>, fmax <dbl>, ttt <dbl>,
#   auc <dbl>

Changing the column-types

source("09_proj_func.R")
tidy_data <- char_type_col(tidy_data)

Relocating columns

The columns containing important information about an observation was moved to the front.

tidy_data <- tidy_data |>
    relocate(project)

tidy_data <- tidy_data |>
    relocate(duration:auc, .after = saa_result)

Cluster the patients’ observations

Some patients have multiple observations and uses the patient_number as an identifier. One patients observations are all gathered together by arranging the patient_number and individually put in order depending on their visit.

tidy_data <- tidy_data |>
  arrange(patient_number, rundate)

Save the cleaned data

write_tsv(tidy_data, "../data/02_dat_clean.tsv")

03_augment

library(tidyverse)

Loading the clean data

tidy_data <- read_tsv("../data/02_dat_clean.tsv")

Augment Data

Finding days from baseline

Some patients were tested multiple times at different dates. We calculate the days from baseline and add a colum with this information.

It should be noted that for 2 patients (40771 and 40754) the rundate appears to have been inputted wrong, so that it appears as if several samples were taken on just one day, despite them having different clinical_events. Therefore, it is insufficient to just sort by days_from_baseline, and clinical_events is therefore kept.

tidy_data <- tidy_data |> 
  mutate(rundate = as.Date(rundate)) |> 
  group_by(patient_number) |> 
  mutate(days_from_baseline = as.numeric(rundate - min(rundate, 
                                                       na.rm = TRUE)
                                         )) |> 
  ungroup()

Calculating means of Fmax, ttt and auc

Instead of working with three replicates of a single observation, moving forward the mean of the variable will be used to represent the observation.

The means are calculated of Fmax, ttt and auc from a single patient (patient_number) per visit (clinical_event).

means <- tidy_data |>
  group_by(patient_number, clinical_event, days_from_baseline) |>
  
  summarise(
    fmax_mean = mean(fmax, na.rm = TRUE),
    ttt_mean  = mean(ttt, na.rm = TRUE),
    auc_mean  = mean(auc, na.rm = TRUE),
    .groups = "drop"
  )

aug_data <- tidy_data |>
  left_join(means,
            by = c("patient_number", 
                   "clinical_event", 
                   "days_from_baseline"))

Initiating a quality check

When PPMI performed their study, they choose a criteria the Fmax value must meet, for the SAA-result to indicate PD. To ensure they correctly assigned positive/negative/inconclusive, the saa_results to the observations, a quality check is made.

As the two projects differ in method, they used different requirements for each.

Result Project 155 - Criteria of Fmax
Positive All 3 replicates have Fmax ≥ 5,000 RFU
Negative 0 or 1 replicate has Fmax ≥ 5,000 RFU
Inconclusive Exactly 2 replicates have Fmax ≥ 5,000 RFU
Result Project 237 - Criteria of Fmax
Positive
  • All 3 replicates have Fmax ≥ 45,000 RFU

    or

  • Exactly 2 or 3 replicates have Fmax ≥ 3,000 RFU & < 45,000 RFU

    or

  • Exactly 2 replicates have Fmax ≥ 45,000 RFU & 1 replicate has Fmax ≥ 3,000 RFU & < 45,000 RFU

Negative
  • 0 or 1 replicate has Fmax ≥ 3,000 RFU
Inconclusive
  • Exactly 2 replicates have Fmax ≥ 45,000 RFU & 1 replicate has Fmax < 3,000 RFU

    or

  • Exactly 1 replicate has Fmax ≥ 45,000 RFU & 1 replicate has Fmax ≥ 3,000 RFU & < 45,000 RFU & 1 replicate has Fmax < 3,000 RFU

To prepare replicate vectors for SAA rule all three fmax from each observation is gathered in a list.

source("09_proj_func.R")

reps <- tidy_data |>
  group_by(patient_number, 
           project, 
           clinical_event, 
           days_from_baseline) |> 
  
  summarise(
    fmax_reps = list(fmax),
    .groups = "drop"
  )

aug_data <- aug_data |> 
  left_join(reps,
            by = c("patient_number", 
                   "project", 
                   "clinical_event",
                   "days_from_baseline"))

The two rules have been made as functions,

Applying the functions and subsequently removing the generated fmax_reps column:

source("09_proj_func.R")

aug_data <- aug_data |> 
  rowwise() |> 
  
  mutate(
    saa_custom = case_when(
      project == "155" ~ saa_rule_155(fmax_reps),
      project == "237" ~ saa_rule_237(fmax_reps),
      TRUE ~ NA_character_
    )
  ) |> 
  
  ungroup()

aug_data <- aug_data |>  
  select(-fmax_reps)

Save the augmented data

write_tsv(aug_data, "../data/03_aug_data.tsv")

04_describe

library(tidyverse)

Loading the augmented data

aug_data <- read_tsv("../data/03_aug_data.tsv")

Describing the data

To ensure uniformity across the different plots a general theme has been made:

  theme_custom <- theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 13, margin = margin(b = 7)),
    plot.subtitle = element_text(size = 10, margin = margin(b = 12)),
    plot.caption = element_text(size = 8, margin = margin(t = 20)),
      
    axis.title.x = element_text(size = 10, margin = margin(t = 10)),
    axis.title.y = element_text(size = 10, margin = margin(r = 10)),

    axis.text.x = element_text(size = 10, margin = margin(t = 5)),
    axis.text.y = element_text(size = 10, margin = margin(r = 5)),
    
    legend.position = "none",
    )

Distribution of observations between projects

The data contains observations made in two projects. The distribution from each projects can be seen in the plot as well as the male/female ratio.

project_distribution <- aug_data |>
  mutate(project = as.character(project)) |>
  ggplot(mapping = aes(x = project)) + 
  geom_bar(mapping = aes(fill = sex)) + 
  scale_fill_manual(values = c("#da9283","#3C3C68"),
                    name = "Sex") +
  theme_custom + 
  theme(legend.position = "bottom") + 
  labs(title = "Project Distribution",
       subtitle = "The distribution of observations and gender across project 155 and 237",
       x = "Project",
       y = "Observations",
       caption = "Source: PPMI (Parkinson's Progression Markers Initiative)")

ggsave(filename = "../results/04_project_distribution.png",
       width = 10,  
       height = 6)

project_distribution

Individual participants

The data-set orignially contained more than 2000 observations. Some of the patients appear multiple times in the data set. The amount of different patients enrolled in the experiment is determined. Obs: we removed one patient during the cleaning because of a missing value interpretation.

patients_participants <- aug_data |>
  distinct(patient_number) |>
  summarize(Participants = n())
patients_participants
# A tibble: 1 × 1
  Participants
         <int>
1         1300

Number of observations

1 observation is rep 1-3

observations <- aug_data |>
  group_by(clinical_event) |>
  distinct(patient_number) |>
  ungroup()|>
  summarize(Observations = n())
observations
# A tibble: 1 × 1
  Observations
         <int>
1         1889

Result of the SAA

All patients enrolled was diagnosed with PD previously to SAA analysis. The result of the SAA does not follow the PD diagnosis completely. The number of saa-positive patients at their latest visit.

aug_data |>
  arrange(desc(days_from_baseline)) |>
  distinct(patient_number, .keep_all = TRUE) |>
  filter(saa_result == "Positive") |>
  summarize(Positive_individuals = n())
# A tibble: 1 × 1
  Positive_individuals
                 <int>
1                 1149

Saa-result from all observations, patient repeats accepted

result_distribution <- aug_data |>
  mutate(project = as.character(project)) |>
  ggplot(mapping = aes(x = project)) + 
  geom_bar(mapping = aes(fill = saa_result)) + 
  scale_fill_manual(values = c("#7A1F15","#da9283","#94BFBE"),
                    name = "Result",
                    guide = guide_legend(nrow = 1)) +
  theme_custom + 
  theme(legend.position = "bottom") + 
  labs(title = "Result Distribution",
       subtitle = "The distribution of saa-result",
       x = "Project",
       y = "Observations",
       caption = "Source: PPMI (Parkinson's Progression Markers Initiative)")

ggsave(filename = "../results/04_result_distribution.png",
       width = 10,  
       height = 5)

result_distribution

Patient repeats

The distribution of each patient’s latest experiment was plotted. Most of the data only contains information from the Baseline experiment. If the patients went for further experiments, most of them got the last assessment ~12 months (V04) after their Baseline.

multiple_experiments <- aug_data |>
  arrange(desc(days_from_baseline)) |>
  distinct(patient_number, .keep_all = TRUE) |>
  ggplot(mapping = aes(x = clinical_event,
                       fill = clinical_event)) + 
  geom_bar(show.legend = FALSE) + 
  scale_fill_manual(values = c("#3C3C68","#627634","#545E75","#C03221", "#94BFBE","#7A1F15","#da9283","#474B24","#474B24","#474B24","#474B24")) +
  theme_custom + 
  labs(title = "Latest observation",
       subtitle = "The latest observation of each patient",
       x = "Latest Observation",
       y = "Participants",
       caption = "Source: PPMI (Parkinson's Progression Markers Initiative)")

ggsave(filename = "../results/04_multiple_experiments.png",
       width = 10,  
       height = 5)

multiple_experiments

Use of instrument

To analyse the samples of cerebrospinal fluid in the SAA, different equipment was used. In project 155 instrument 2, 4 and 6 were equally used. During project 237 these instruments are also used frequently in addition to instrument 5.

instruments_used <- aug_data |>
  mutate(project = as.character(project)) |>
  mutate(instrument = as.character(instrument)) |>
  ggplot(mapping = aes(x = project)) + 
  geom_bar(mapping = aes(fill = instrument),
           position = "fill") + 
  scale_fill_manual(breaks = c("2","4","5","6","9","10"),
                    values = c("#da9283","#7A1F15","#C03221","#94BFBE","#3C3C68","#627634"),
                    name = "Instrument",
                    guide = guide_legend(nrow = 1)) +
  theme_custom + 
  theme(legend.position = "bottom") + 
  labs(title = "Used Instrument",
       subtitle = "The instrument used to analyze each observation",
       x = "Project",
       y = "Ratio",
       caption = "Source: PPMI (Parkinson's Progression Markers Initiative)")

ggsave(filename = "../results/04_instruments_used.png",
       width = 10,  
       height = 6)

instruments_used

High and low ranges of key values

To get a better understanding of the ranges of key values (fmax, auc, ttt) we look at a few examples at each end of the spectrum.

source("09_proj_func.R")

# The function returns 2 values: one from project 155 and one from 237:
low_range(aug_data, fmax, 50)
$Project_155
[1] 263.86

$Project_237
[1] 448.66
low_range(aug_data, auc, 50)
$Project_155
[1] 7723959

$Project_237
[1] 2364708
low_range(aug_data, ttt, 50)
$Project_155
[1] 38.9364

$Project_237
[1] 6.4352
source("09_proj_func.R")

# The function returns 2 values: one from project 155 and one from 237:
high_range(aug_data, fmax, 50)
$Project_155
[1] 159562.7

$Project_237
[1] 237574.2
high_range(aug_data, auc, 50)
$Project_155
[1] 36286298

$Project_237
[1] 9714900000
high_range(aug_data, ttt, 50)
$Project_155
[1] 122.1678

$Project_237
[1] 22.4912
library(patchwork)

((instruments_used + project_distribution) / multiple_experiments / result_distribution) +
  plot_layout(guides = "collect",
              heights = c(5,5,7,5)) &
  theme(legend.position = "bottom")

05_analysis_1

library(tidyverse)

Loading the augmented data

aug_data <- read_tsv("../data/03_aug_data.tsv")

source("09_proj_func.R")
aug_data <- char_type_col(aug_data)
  theme_custom <- theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 13, margin = margin(b = 7)),
    plot.subtitle = element_text(size = 10, margin = margin(b = 12)),
    plot.caption = element_text(size = 8, margin = margin(t = 20)),
      
    axis.title.x = element_text(size = 10, margin = margin(t = 10)),
    axis.title.y = element_text(size = 10, margin = margin(r = 10)),

    axis.text.x = element_text(size = 10, margin = margin(t = 5)),
    axis.text.y = element_text(size = 10, margin = margin(r = 5)),
    
    legend.position = "none",
    )

Performing the quality check of saa_result

During the augmentation a new column was made using the same criteria published by PPMI used to create their saa_result column. We check for any variations between their positive/negative/inconclusive results and the new column.

mismatches_saa_results <- aug_data |>
  filter(saa_result != saa_custom) |>
  select(project,
         patient_number,
         saa_result,
         saa_custom,
         rep,
         fmax
         ) |>
  
  pivot_wider(
    names_from = rep,
    values_from = fmax,
    names_prefix = "rep_"
    )

mismatches_saa_results
# A tibble: 7 × 7
  project patient_number saa_result saa_custom    rep_1 rep_2 rep_3
  <chr>            <dbl> <chr>      <chr>         <dbl> <dbl> <dbl>
1 155               3285 Negative   Inconclusive   7564 96765   277
2 155               3473 Negative   Inconclusive 124560   317 10256
3 155               3753 Negative   Inconclusive   5361   306 18058
4 155               3972 Negative   Inconclusive    290  8037 37230
5 155               4076 Negative   Inconclusive  75830   309  6933
6 155               4121 Negative   Inconclusive  88234 11994   317
7 155              41749 Negative   Inconclusive   4885  5708 35796

As seen above some patients have a negative saa_result, whereas when the column were recreated the results were inconclusive. When looking at the criteria given from PPMI the result should have been inconclusive we found.

Result Project 155 - Criteria of Fmax
Positive All 3 replicates have Fmax ≥ 5,000 RFU
Negative 0 or 1 replicate has Fmax ≥ 5,000 RFU
Inconclusive Exactly 2 replicates have Fmax ≥ 5,000 RFU

In most cases it looks like they decided to assign the negative value if one of the fmax repetition had a much lower value even if 2 replicates have fmax above 5000 RFU.

A visualization can be seen below:

aug_data |>
  mutate(match = saa_result == saa_custom) |>
  ggplot(aes(x = match, 
             fill = match)
         ) +
  geom_bar() +
  scale_fill_manual(values = c("#da9283","#3C3C68")) +
  facet_wrap(~ project) +
  theme_custom + 
  labs(title = "Match versus mismacth",
       subtitle = "Comparison between PPMI's saa_result and custom made results column",
       x = "Match",
       y = "Count",
       caption = "Source: PPMI (Parkinson's Progression Markers Initiative)")

ggsave(filename = "../results/05_quality_check.png",
       width = 10,  
       height = 5)

To visualize the 7 mismatches:

mismatches_saa_results |>
  pivot_longer(
    cols = starts_with("rep"),
    names_to = "rep",
    values_to = "fmax"
  ) |>
  ggplot(aes(x = rep,
             y = fmax,
             group = patient_number)) + 
  
  geom_line(color = "#C03221",
            alpha = 0.5) +
  
  geom_point(size = 3, 
             color = "#C03221") +
  
  geom_hline(yintercept = 5000, 
             linetype = "dashed") +
  
  facet_wrap(~ patient_number, 
             scales = "free_y",
             ncol = 4) +
  
  theme_minimal() +
  theme_custom +
  
  labs(
    title = "Mismatch cases: 155 negative vs our inconclusive",
    subtitle = "7 mismatches (Reported as negative, by definition inconclusive)",
    x = "Replicate",
    y = "Fmax (RFU)",
    caption = "Source: PPMI (Parkinson's Progression Markers Initiative)"
  )

ggsave(filename = "../results/05_mismatched_saa_plot.png",
       width = 14.5,  
       height = 6.5)  

A log2 plot:

mismatches_saa_results |>
  pivot_longer(
    cols = starts_with("rep"),
    names_to = "rep",
    values_to = "fmax"
  ) |>
  ggplot(aes(x = rep,
             y = log2(fmax),
             group = patient_number)) + 
  
  geom_line(color = "#C03221",
            alpha = 0.5) +
  
  geom_point(size = 3, 
             color = "#C03221") +
  
  geom_hline(yintercept = log2(5000), 
             linetype = "dashed") +
  
  facet_wrap(~ patient_number, 
             scales = "free_y",
             ncol = 4) +
  
  theme_minimal() +
  theme_custom +
  
  labs(
    title = "Mismatch cases: 155 negative vs our inconclusive (Log2)",
    subtitle = "7 mismatches (Reported as negative, by definition inconclusive)",
    x = "Replicate",
    y = "log2 of Fmax (RFU)",
    caption = "Source: PPMI (Parkinson's Progression Markers Initiative)"
  )

ggsave(filename = "../results/05_mismatched_saa_plot_log.png",
       width = 14.5,  
       height = 6.5)  

There is maybe an extra criteria

mismatches_saa_results |>
  pivot_longer(
    cols = starts_with("rep"),
    names_to = "rep",
    values_to = "fmax"
  ) |>
  summarize(median = median(fmax),
            fmax,
            vari = max(fmax)-min(fmax),
            mean = mean(fmax),
            .by = patient_number) |> 
  
  arrange(fmax)
# A tibble: 21 × 5
   patient_number median  fmax   vari   mean
            <dbl>  <dbl> <dbl>  <dbl>  <dbl>
 1           3285   7564   277  96488 34869.
 2           3972   8037   290  36940 15186.
 3           3753   5361   306  17752  7908.
 4           4076   6933   309  75521 27691.
 5           3473  10256   317 124243 45044.
 6           4121  11994   317  87917 33515 
 7          41749   5708  4885  30911 15463 
 8           3753   5361  5361  17752  7908.
 9          41749   5708  5708  30911 15463 
10           4076   6933  6933  75521 27691.
# ℹ 11 more rows
aug_data |>
  filter(saa_result == "Inconclusive") |>
  ungroup() |>
  summarize(median = median(fmax),
            fmax,
            vari = max(fmax)-min(fmax),
            mean = mean(fmax),
            .by = patient_number) |> 
  arrange(fmax)
# A tibble: 146 × 5
   patient_number median  fmax   vari   mean
            <dbl>  <dbl> <dbl>  <dbl>  <dbl>
 1           3269 64985    246  68955 44811.
 2           3406 71973    262 107042 59846.
 3           3440  5380    264  14388  6765.
 4           3708 21002    265  34597 18710.
 5          41291 58786.   274 167947 71932.
 6           3387  8866    277  14443  7954.
 7         101124 43462    281  49603 31209 
 8          41488 36637    284  90658 39357 
 9          50157 80813    302 125574 68997 
10          41282 34362    306  90380 41785.
# ℹ 136 more rows
aug_data |>
  filter(saa_result == "Negative") |>
  ungroup() |>
  summarize(median = median(fmax),
            fmax,
            vari = max(fmax)-min(fmax),
            mean = mean(fmax),
            .by = patient_number) |> 
  arrange(fmax)
# A tibble: 954 × 5
   patient_number median  fmax   vari   mean
            <dbl>  <dbl> <dbl>  <dbl>  <dbl>
 1         100018    339   209  28111  4982.
 2         100018    339   214  28111  4982.
 3         100018    339   216  28111  4982.
 4         101092    413   216   6968  1513.
 5         101092    413   225   6968  1513.
 6         101092    413   228   6968  1513.
 7          41886    855   257   6204  1842.
 8          41314    395   265 135843 22996 
 9          71978    441   265    395   450.
10           3233    270   266  26106  8969.
# ℹ 944 more rows

06_analysis_2

Loading libraries and loading the data

library(tidyverse)

aug_data <- read_tsv("../data/03_aug_data.tsv")

#making sure that the columns is the right type
source("09_proj_func.R")
aug_data <- char_type_col(aug_data)

Defining colors and theme

my_cols <- c("#3C3C68","#627634","#C03221","#545E75", "#94BFBE", "#7A1F15",
             "#da9283" , "#474B24")

theme_custom <- theme_minimal(base_size = 14) +
  
  theme(
    plot.title = element_text(face = "bold", 
                              size = 13, 
                              margin = margin(b = 7)
                              ),
    
    plot.subtitle = element_text(size = 10, 
                                 margin = margin(b = 12)
                                 ),
    
    plot.caption = element_text(size = 8, 
                                margin = margin(t = 20)
                                ),
      
    axis.title.x = element_text(size = 10, 
                                margin = margin(t = 10)
                                ),
    
    axis.title.y = element_text(size = 10, 
                                margin = margin(r = 10)
                                ),

    axis.text.x = element_text(size = 10, 
                               margin = margin(t = 5)
                               ),
    
    axis.text.y = element_text(size = 10, 
                               margin = margin(r = 5)
                               ),
    
    legend.position = "none",
    )

Analysis of Fmax and SAA results

Seed Amplification Assays (SAA) detect α-synuclein aggregates in cerebrospinal fluid (CSF) and monitors fluorescence intensity over time. The point at which fluorescence reaches its maximum (Fmax) reflects the level of α-synuclein aggregation and is therefore a potential diagnostic indicator for Parkinson’s disease (PD).

The two projects perform the assays under different variables:

  • Project 155: 150-hour assay, fluorescence measured every 29 minutes

  • Project 237: 24-hour assay, fluorescence measured every 14 minutes

Variations in assay duration, cycle length, and reaction mixture composition mean that fluorescence values (Fmax) are not directly comparable across projects.

Because assay duration, cycle frequency, and reaction mixture differ between projects, fluorescence values (Fmax) cannot be directly compared without looking at protocol-specific characteristics.

We will look at the separation and overlap in fmax, when determining SAA result.

SAA result distribution

We will look at how the ratio of negative, positive and inconclusive is

saa_result_dist <- aug_data |>
  ggplot(mapping = aes(x = project)) + 
  geom_bar(mapping = aes(fill = saa_result),
           position = "fill") + 
  
  scale_fill_manual(values = c("#da9283","#7A1F15","#94BFBE"),
                    name = "SAA result",
                    guide = guide_legend(nrow = 1)) +
  theme_custom + 
  theme(legend.position = "bottom") + 
  labs(title = "SAA result distribution",
       subtitle = "SAA result ratio from the two projects",
       x = "Project",
       y = "Ratio",
       caption = "Source: PPMI (Parkinson's Progression Markers Initiative)")

ggsave(filename = "../results/06_saa_result_dist.png",
       width = 10,  
       height = 5)   

saa_result_dist

It seems to be very similar - not a very big difference in the ratio. Trying to see the separation:

Analysis of Fmax_mean as alternative test parameter

In this section we will test the variable Fmax_mean as a valid predictor of test result status. Fmax_mean is a mean calculated from the three samples collected in every assay.

Test distinction by way of Fmax_mean

First we select the 150h test group from the dataset, and then we plot the Fmax_mean grouped by their result status(positive/negative/inconclusive) and do a visual inspection and evaluate if the groups are distinguishable.

fmax_mean_results_150h <- aug_data |> 
  filter(project == "155") |> 
  ggplot(aes(
    x = saa_result,
    y = fmax_mean,
    fill = saa_result)
    ) +
  
  geom_boxplot(outlier.alpha = 0.3) +
  
  scale_color_manual(values = my_cols) +
  scale_fill_manual(values = my_cols) +
  
  labs(
    title = "150h Metrics by SAA Status",
    y = "Fmax mean",
    x = NULL,
    caption = "Source: PPMI (Parkinson's Progression Markers Initiative)"
    ) +
  theme_custom
fmax_mean_results_150h

ggsave(filename = "../results/06_fmax_mean_results_150h.png",        
       width = 10,
       height = 5)  

From the plot we see that that the positive and negative group is distinguishable so we conduct a t-test to evaluate if we can predict what group a measurement would belong to with confidence.

First we filter for positive and negative and drop NA values.

filtered_155 <- aug_data |> 
  filter(saa_result %in% c("Positive", 
                           "Negative"), 
         project == "155") |>   
  
  drop_na(fmax_mean) |>
  
  select(patient_number, 
         saa_result, 
         fmax_mean)

Then we conduct the T-test

t.test(fmax_mean ~ saa_result, 
       data = filtered_155)

    Welch Two Sample t-test

data:  fmax_mean by saa_result
t = -87.014, df = 812.08, p-value < 2.2e-16
alternative hypothesis: true difference in means between group Negative and group Positive is not equal to 0
95 percent confidence interval:
 -81752.50 -78145.48
sample estimates:
mean in group Negative mean in group Positive 
              4806.431              84755.423 

Here the T-test shows a highly statistically significant difference between the groups.

We repeat the steps for the 24h test group

fmax_mean_results_24h <- aug_data |>
  filter(project == "237") |>
  ggplot(aes(
    x = saa_result,
    y = fmax_mean,
    fill = saa_result)
    ) +
  
  geom_boxplot(outlier.alpha = 0.3) +
  scale_color_manual(values = my_cols) +
  scale_fill_manual(values = my_cols) +
  
  labs(title = "24h Metrics by SAA Status",
       y = "Fmax mean",
       x = NULL,
       caption = "Source: PPMI (Parkinson's Progression Markers Initiative)") +
  
  theme_custom
fmax_mean_results_24h

ggsave(filename = "../results/06_fmax_mean_results_24h.png",
       width = 10,
       height = 5)  
filtered_237 <- aug_data |>
  filter(saa_result %in% c("Positive", 
                           "Negative"),
         project == "237") |> 
  
  drop_na(fmax_mean) |>
  
  select(patient_number, 
         saa_result, 
         fmax_mean)  

filtered_237 |> count(saa_result) 
# A tibble: 2 × 2
  saa_result     n
  <chr>      <int>
1 Negative     729
2 Positive    3714
t.test(fmax_mean ~ saa_result, 
       data = filtered_237)

    Welch Two Sample t-test

data:  fmax_mean by saa_result
t = -179.74, df = 3384.4, p-value < 2.2e-16
alternative hypothesis: true difference in means between group Negative and group Positive is not equal to 0
95 percent confidence interval:
 -139880.3 -136861.5
sample estimates:
mean in group Negative mean in group Positive 
               5466.35              143837.26 

Here we also show the ability to visually distinguish between the groups and we show a highly statistically significant difference from the T-test.

Conclusion

From these observations we have highlighted the use of Fmax_mean as an alternative diagnostic variable both for the tests done 24 and 150 hours after the sampling. Thus we can qualify both methods as viable testing methods for diagnostics. The data set includes 4 other variables that may also be suited for this(AUC, SLOPE, T50, TTT), the above process could simply be repeated for each one in order to evaluate them.

07_analysis_3

Loading libraries and loading the data

library(tidyverse)

aug_data <- read_tsv("../data/03_aug_data.tsv")

#making sure that the columns is the right type

source("09_proj_func.R")
aug_data <- char_type_col(aug_data)

Defining colors and theme

my_cols <- c("#3C3C68","#627634","#C03221","#545E75", "#94BFBE", "#7A1F15",
             "#da9283" , "#474B24")

theme_custom <- theme_minimal(base_size = 14) +
  
  theme(
    plot.title = element_text(face = "bold", 
                              size = 13, 
                              margin = margin(b = 7)
                              ),
    
    plot.subtitle = element_text(size = 10, 
                                 margin = margin(b = 12)
                                 ),
    
    plot.caption = element_text(size = 8, 
                                margin = margin(t = 20)
                                ),
      
    axis.title.x = element_text(size = 10, 
                                margin = margin(t = 10)
                                ),
    
    axis.title.y = element_text(size = 10, 
                                margin = margin(r = 10)
                                ),

    axis.text.x = element_text(size = 10, 
                               margin = margin(t = 5)
                               ),
    
    axis.text.y = element_text(size = 10, 
                               margin = margin(r = 5)
                               ),
    
    legend.position = "none",
    )

Goal of this analysis

The goal of this analysis is to evaluate the consistency of the two assays, including whether instrument differences contribute to variability.

Comparison of Overall Fmax Levels

aug_data |> 
  ggplot(
    aes(
      x = fmax_mean,
      y = project,
      color = project,
      fill = project
      )
    ) + 
  
  geom_violin(alpha = 0.5, 
              width = 0.8, 
              trim = FALSE) + 
  
  geom_boxplot(width = 0.12, 
               color = "black", 
               outlier.alpha = 0.3) +
  
  scale_color_manual(values = my_cols) +
  scale_fill_manual(values = my_cols) +
  theme_minimal(base_size = 14) +
  
  labs(
    title = "Distribution of Mean Fmax Across SAA Projects",
    subtitle = "Comparison between 24-hour (Project 237) and 
    150-hour (Project 155) α-synuclein SAA assays",
    x = "Mean Fmax [RFU]",
    y = "Project",
    caption = "Source: PPMI (Parkinson's Progression Markers Initiative)"
  ) + 
  
  theme_custom +
  scale_x_continuous(labels = scales::label_comma())

ggsave(filename = "../results/07_fmax_mean_dist.png",
       width = 10,  
       height = 5)  

Although absolute values differ, the focus of this analysis is not the magnitude of Fmax but the reproducibility of replicate measurements within each project.

Consistency of Technical Replicates

Distribution of Replicate Fmax Values

Means alone can’t be used to compare the consistency of the two assays. To do this, we will compare Fmax across replicates 1–3 for each project.

aug_data |> 
  ggplot(aes(
    x = fmax, 
    y = rep, 
    color = project, 
    fill = project)
    ) +
  
  geom_boxplot(alpha = 0.7) +
  
  facet_wrap(~ project, scales = "free_x") +
  
  scale_color_manual(values = my_cols) +
  scale_fill_manual(values = my_cols) +
  
  labs(
    title = "Replicate Variation in Fmax",
    subtitle = "Three technical replicates per sample in Projects 155 and 237",
    x = "Fmax [RFU]",
    y = "Replicate Number",
    caption = "Source: PPMI (Parkinson's Progression Markers Initiative)"
  ) +
  
  theme_custom +
  theme(legend.position = "none") + 
  scale_x_continuous(labels = scales::label_comma())

ggsave(filename = "../results/07_fmax_by_rep.png",
       width = 10,  
       height = 5)  
repeats_fmax_stats <- aug_data |> 
  ungroup() |> 
  summarize(mean=mean(fmax), 
            median=median(fmax),
            IQR = quantile(fmax,.75)- quantile(fmax,.25),
            .by = c(project, rep))

Summary statistics:

Project Rep 1 Median Rep 2 Median Rep 3 Median Variation (Range)
155 72,926 70,262 69,721 3,205
237 142,138 135,570 137,200 6,568

Observations

  • Project 155: Shows a small, gradual decrease from replicate 1 to 3.

  • Project 237: Shows a dip in replicate 2, with higher values in replicates 1 and 3.

These patterns are consistent and most likely reflect behavior that is specific for protocol.

Replicate Variability (IQR)

Project Rep 1 IQR Rep 2 IQR Rep 3 IQR Variation
155 ~69,736 ~69,404 ~82,973 13,569
237 ~59,530 ~63,910 ~63,179 4,380

Interpretation

  • Project 237: shows a more “tight” replicate clustering, indicating greater precision.

  • Project 155: shows larger and less stable variability, especially in replicate 3.

Replicate Standard Deviation (SD)

Standard deviation (SD) of Fmax across replicates measures stability of the assays.

sd_saa_data <- aug_data |> 
  group_by(patient_number, rundate) |> 
  mutate(fmax_sd = sd(fmax))
sd_saa_data |> 
  filter(instrument %in% c(2, 4, 6)) |> 
  ggplot(aes(x = fmax_sd, 
             y = project, 
             fill = project)) +
  
  geom_boxplot(alpha = 0.8) +
  
  scale_fill_manual(values = my_cols) +
  
  labs(
    title = "Replicate Standard Deviation of Fmax",
    subtitle = "Higher SD indicates lower assay precision",
    x = "Standard Deviation of Fmax [RFU]",
    y = "Project",
    caption = "Source: PPMI (Parkinson's Progression Markers Initiative)"
  ) +
  
  theme_custom +
  theme(legend.position = "none") +
  scale_x_continuous(labels = scales::label_comma())

ggsave(filename = "../results/07_sd_by_project.png",
       width = 12,  
       height = 4.3)
sd_saa_stats <- sd_saa_data |>
  ungroup() |> 
  summarize(min = min(fmax_sd), 
            mean=mean(fmax_sd), 
            median=median(fmax_sd), 
            max=max(fmax_sd),
            IQR = quantile(fmax_sd,.75)- quantile(fmax_sd,.25),
             .by = project)

Summary statistics:

Project Mean SD Median SD IQR
155 24,066 22,198 26,131
237 10,386 5,662 6,470

The 150-hour assay (155) has 2–4 times higher replicate variability than the 24-hour assay (237). This is one of the most important quality results in the entire analysis. Though, it still overlaps, and therefore its hard to make a strong conclusion.

Instrument-Specific Variability

Checking ratio of the usage of the three instruments

aug_data |>
  filter(instrument %in% c(2, 4, 6)) |> 
  ggplot(mapping = aes(x = project)) + 
  
  geom_bar(mapping = aes(fill = instrument),
           position = "fill") + 
  
  scale_fill_manual(breaks = c("2","4","5","6","9","10"),
                    values = c("#da9283","#3C3C68","#C03221","#627634","#7A1F15","#94BFBE"),
                    name = "Instrument",
                    guide = guide_legend(nrow = 1)) +
  theme_custom + 
  theme(legend.position = "bottom") + 
  labs(title = "Ratio used instruments 2, 4 and 6 ",
       subtitle = "The instrument used to analyze each observation",
       x = "Project",
       y = "Ratio")

We next examined whether any fluorescence reader (instruments 2, 4, 6) contributed disproportionately to the SD.

sd_saa_data |> 
  filter(instrument %in% c(2,4,6)) |> 
  
  ggplot(aes(
      x = fmax_sd, 
      y = instrument, 
      color = factor(instrument), 
      fill = factor(instrument)
  )) +
  
  geom_boxplot(alpha = 0.7) +
  facet_wrap(~ project, scales = "free_x") +
  
  scale_color_manual(values = my_cols) +
  scale_fill_manual(values = my_cols) +
  
  labs(
    title = "Instrument-Specific Variability in Fmax",
    subtitle = "Comparison across three fluorescence readers",
    x = "SD of Fmax [RFU]",
    y = "Instrument",
    caption = "Source: PPMI (Parkinson's Progression Markers Initiative)"
  ) +
  
  theme_custom +
  scale_x_continuous(labels = scales::label_comma())

ggsave(filename = "../results/07_sd_by_instrument.png",
       width = 11,  
       height = 7)
sd_instruments_stats <- sd_saa_data |> 
  ungroup() |> 
  summarize(mean=mean(fmax_sd), 
            median=median(fmax_sd), 
            IQR = quantile(fmax_sd,.75)- quantile(fmax_sd,.25),
             .by = c(project, instrument))

Instrument-level statistics:

Project Instr. Mean SD Median SD IQR
155 2 30,015 28,999 30,174
237 2 8,384 4,482 4,953
155 4 21,469 20,743 21,746
237 4 8,788 4,594 5,483
155 6 20,190 17,562 24,950
237 6 13,107 8,357 6,809

Interpretation

  • No single instrument is consistently “worst” or “best.”

  • Instrument effects depend on the project, indicating interaction between protocol and hardware.

  • Even the highest SD in Project 237 is lower than the lowest SD in Project 155.
    This confirms that protocol (not instrument) is the dominant source of variability.

Raw Fmax by Instruments and Replicates

To visualize replicate patterns across instruments:

aug_data |> 
  filter(instrument %in% c(2,4,6)) |> 
  
  ggplot(aes(
      x = fmax, 
      y = factor(rep), 
      color = factor(instrument),
      fill = factor(instrument)
  )) +
  
  geom_boxplot(alpha = 0.7) +
  facet_grid(instrument ~ project) +
  
  scale_color_manual(values = my_cols) +
  scale_fill_manual(values = my_cols) +
  
  labs(
    title = "Raw Fmax Across Instruments and Replicates",
    x = "Fmax [RFU]",
    y = "Replicate Number",
    caption = "Source: PPMI (Parkinson's Progression Markers Initiative)"
  ) +
  
  theme_custom +
    scale_x_continuous(labels = scales::label_comma())

ggsave(filename = "../results/07_fmax_raw_rep_inst.png",
       width = 10,  
       height = 5)  
stats_instruments_project <- aug_data |>
  ungroup() |> 
  filter(instrument == 2 | instrument == 4 | instrument == 6) |> 
  summarize(min = min(fmax), 
            mean=mean(fmax), 
            median=median(fmax), 
            max=max(fmax),
             .by = c(project, rep, instrument))

stats_instruments_project
# A tibble: 18 × 7
   project rep   instrument   min    mean  median    max
   <chr>   <chr> <chr>      <dbl>   <dbl>   <dbl>  <dbl>
 1 155     1     2            209  93224.  99592. 184313
 2 155     2     2            214  90078   98395  171306
 3 155     3     2            216  88698.  94544. 171318
 4 237     1     6            493 142075. 158205  247730
 5 237     2     6            489 128143. 143678  235264
 6 237     3     6            497 125836. 141988  238974
 7 237     1     2            518 116011. 137888. 208848
 8 237     2     2            508 112391. 133614  202797
 9 237     3     2            521 114789. 134412  211862
10 237     1     4            445 113159. 132906  250877
11 237     2     4            422 109894. 130298  244977
12 237     3     4            426 113289. 133050  257860
13 155     1     4            290  66303.  69509  136842
14 155     2     4            301  66400.  69292. 136781
15 155     3     4            300  64469.  71845  136621
16 155     1     6            265  63267.  68514  126190
17 155     2     6            264  62577.  64985  127412
18 155     3     6            246  62107.  68970  131732

Across both projects, either replicate 1 or 3 typically gives the highest Fmax.
This aligns with known SAA behavior:

  • Rep 1 often initiates early aggregation (“priming”).

  • Rep 3 often captures stabilized aggregation at late cycles.

Instrument-specific medians confirm these patterns and align with earlier SD/IQR findings.

Overall Interpretation

Across all analyses, mean Fmax, replicate medians, IQR, SD, and instrument comparisons, Project 237 shows less variability and replicate reproducibility.

Key conclusions:

  • Fmax magnitude: Differs between projects due to protocol differences; not directly comparable without looking at protocol specific factors.

  • Replicate variability: Is substantially lower in the 24-hour assay (Project 237).

  • Standard deviation: Is 2–4 times lower in Project 237, making it markedly more stable.

  • Instrument effects: Are present but small compared with protocol effects. The differences is project based, not instrument based.

  • Project 237 seems to be more consistent, but because of their overlaps in standard deviation, it is hard to make a strong conclusion.

08_analysis_4

Load libraries and data

library(tidyverse)
library(readr)
library(modelr)
aug_data <- read_tsv("../data/03_dat_aug.tsv")

Defining colors and theme

my_cols <- c("#3C3C68","#C03221","#6D8E64","#94BFBE","#545E75",  "#7A1F15",
             "#da9283" , "#474B24")

theme_custom <- theme_minimal(base_size = 14) +
  
  theme(
    plot.title = element_text(face = "bold", 
                              size = 13, 
                              margin = margin(b = 7)
                              ),
    
    plot.subtitle = element_text(size = 10, 
                                 margin = margin(b = 12)
                                 ),
    
    plot.caption = element_text(size = 8, 
                                margin = margin(t = 20)
                                ),
      
    axis.title.x = element_text(size = 10, 
                                margin = margin(t = 10)
                                ),
    
    axis.title.y = element_text(size = 10, 
                                margin = margin(r = 10)
                                ),

    axis.text.x = element_text(size = 10, 
                               margin = margin(t = 5)
                               ),
    
    axis.text.y = element_text(size = 10, 
                               margin = margin(r = 5)
                               ),
        )

Analysis

The purpose of this analysis is to see whether the test gets better at subsequent visits or rather: does the test become better further into disease progression? We investigate this by looking at mean fmax, which of course is not strictly related to positive tests, over the calculated days from baseline. This days from baseline of course does not correspond to overall disease progression but is a measure of time for each individual patient. However, we will still try to investigate it with modeling. First, an overall scatter plot.

aug_data |> 
  ggplot(aes(x = days_from_baseline, 
               y = fmax_mean)) +
  geom_point(aes(color = saa_result,
                 shape = as.factor(project))) +
  labs(title = "Mean fmax over time",
       x = "Days from baseline",
       y = "Mean fmax (RFU)",
       shape = "Project",
       color = "SAA status",
       caption = "Source: PPMI (Parkinson's Progression Markers Initiative)") +
  
  scale_color_manual(values = my_cols) +
  scale_fill_manual(values = my_cols) +
  
  theme_minimal() +
  theme_custom

  ggsave(filename = "../results/08_fmax_over_time.png",
       height = 5,
       width = 10)

As the fmax from the projects is not comparable, we will only look at one for the modelling. This will be project 155 as project 237 only have observations close to baseline.

Below a linear model is fitted as well as a residual plot.

sim1_mod <- lm(fmax_mean ~ days_from_baseline, 
               data = filter(aug_data, project == 237))

coef(sim1_mod)
       (Intercept) days_from_baseline 
      1.194281e+05       9.479028e-01 
grid <- aug_data|> 
  filter(project == 237) |> 
  data_grid(days_from_baseline)
grid <- grid |> 
  add_predictions(sim1_mod) 
aug_data |> 
  filter(project == 237) |> 
  ggplot(aes(x = days_from_baseline, 
             y = fmax_mean)) +
  geom_point(aes(color = saa_result)) +
  geom_line(aes(y = pred), 
            data = grid, 
            colour = "red", 
            size = 1) + 
  
  labs(title = "Mean fmax over time for project 237 with linear fit",
       x = "Days from baseline",
       y = "Mean fmax (RFU)",
       shape = "Project",
       color = "SAA status",
       caption = "Source: PPMI (Parkinson's Progression Markers Initiative)") +
  
  scale_color_manual(values = my_cols) +
  scale_fill_manual(values = my_cols) +
  
  theme_minimal() +
  theme_custom
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

ggsave(filename = "../results/08_fmax_over_time_lm.png",
       height = 5,
       width = 12)
resid <- filter(aug_data, project == 237) |> 
  add_residuals(sim1_mod)

ggplot(resid, aes(days_from_baseline, 
                  resid)) + 
  geom_ref_line(h = 0) +
  geom_point(color = "#545E75") +
  theme_minimal() +
  
  labs(title = "Residual plot for lm of fmax over time for project 237",
       x = "Days from baseline",
       y = "Resid",
       caption = "Source: PPMI (Parkinson's Progression Markers Initiative)") +
  
  theme_custom +
  scale_y_continuous(labels = scales::label_comma())

ggsave(filename = "../results/08_fmax_over_time_resid.png",
       height = 5,
       width = 10)

This really tells us nothing. Fitting to a linear model does not make sense, as expected from the scatter plot, which showed no clear pattern.

This may be because that days from baseline is a bad indicator for disease progression, so for this reason we want to look at the development for each patient. Only a few patients had multiple samples taken, so first we look at the 12 patients with the most patients.

top_12 <- aug_data |> 
  count(patient_number, sort = TRUE) |> 
  head(12)

aug_data |> 
  filter(patient_number %in% top_12$patient_number) |> 
    ggplot(aes(x = days_from_baseline, 
               y = fmax_mean)) +
  
  geom_line(color = "gray") +
  
  geom_point(aes(color = saa_result,
                 shape = as.factor(project))) +
  
  facet_wrap(~ patient_number) +
  theme_minimal() +
  
  scale_color_manual(values = my_cols) +
  scale_fill_manual(values = my_cols) +
  
  scale_x_continuous(n.breaks = 3) +
  
  labs(title = "Mean fmax over time for the 12 patients with the most visits",
       x = "Days from baseline",
       y = "Mean fmax (RFU)",
       shape = "Project",
       color = "SAA status",
       caption = "Source: PPMI (Parkinson's Progression Markers Initiative)") +
  
  theme_custom

ggsave(filename = "../results/08_fmax_over_time_top12.png",
       height = 5,
       width = 12)

This last bit of code allows looking at a random sample from the top 100 of patients with the most visits.

top_100 <- aug_data |> 
  count(patient_number, sort = TRUE) |> 
  head(100)

sample <- top_100 |> 
  sample_n(20)

aug_data |> 
  filter(patient_number %in% sample$patient_number) |> 
  ggplot(aes(x = days_from_baseline, 
               y = fmax_mean)) +
  
  geom_line(color = "gray") +

  geom_point(aes(color = saa_result,
                 shape = as.factor(project))) +
  theme_minimal() +
  scale_x_continuous(n.breaks = 3) +
  facet_wrap(~ patient_number) +
  scale_color_manual(values = my_cols) +
  scale_fill_manual(values = my_cols) +
  
  labs(title = "Mean fmax over time for random patients with the most visits",
       x = "Days from baseline",
       y = "Mean fmax (RFU)",
       shape = "Project",
       color = "SAA status",
       caption = "Source: PPMI (Parkinson's Progression Markers Initiative)") +
  
  theme_custom